from selenium import webdriver
from selenium.webdriver.common.by import By
from docx import Document

# Publication date to export. It is compared with the date label of each
# list entry after the surrounding "[" and "]" have been removed.
news_data = "2023年03月16日"

# Address of the news list page and the XPath selecting its entries: the
# ``li`` items of the first ``ul`` inside the ``div.fl`` container.
LIST_PAGE_URL = 'http://cpc.people.com.cn/GB/443712/'
LIST_ITEM_XPATH = '//div[@class="p2j_con02 clearfix g-w1200"]/div[@class="fl"]/ul[1]/li'

# Start a Chrome session and load the list page.
driver = webdriver.Chrome()
driver.get(LIST_PAGE_URL)

# Element handles for every entry currently shown in the list.
url_list = driver.find_elements(By.XPATH, LIST_ITEM_XPATH)

# String replacement: substitute a placeholder tag in a Word document
def replace_word(doc, tag, pv):
    """Replace ``tag`` with ``pv`` in the paragraphs returned by ``doc.paragraphs``.

    A tag may be spread over several consecutive runs, so run texts are
    buffered until the complete tag is present. The buffered text, with the
    tag substituted, is then written to the run that completed the tag and
    the earlier buffered runs are emptied.

    Runs that cannot be part of a tag are left exactly as they are, so their
    text and formatting are not merged into a neighbouring run.

    Args:
        doc: Object exposing ``paragraphs``. Each paragraph must provide
            ``text`` and ``runs``; each run must provide a readable and
            writable ``text``. This script passes a python-docx document.
        tag: Placeholder to look for, e.g. ``"{{标题}}"``.
        pv: Replacement string.

    Returns:
        None. ``doc`` is modified in place.

    Note:
        The buffer is cleared after every match, so a second tag that starts
        in the run where the previous one ended and continues into the next
        run is not detected.
    """
    # Every proper, non-empty prefix of the tag. If the buffered text ends
    # with one of these, the tag may still be completed by the next run.
    partial_tags = tuple(tag[:size] for size in range(1, len(tag)))

    for paragraph in doc.paragraphs:
        if tag not in paragraph.text:
            continue

        pending = []   # runs whose text is currently held in ``buffered``
        buffered = ''
        for run in paragraph.runs:
            pending.append(run)
            buffered += run.text

            if tag in buffered:
                # The tag is complete: move the merged text into this run and
                # blank the earlier runs that contributed to it.
                for earlier in pending[:-1]:
                    earlier.text = ''
                run.text = buffered.replace(tag, pv)
                pending, buffered = [], ''
            elif not buffered.endswith(partial_tags):
                # No tag can start inside the buffered text, so these runs
                # are not involved; forget them without modifying them.
                pending, buffered = [], ''




# Build one Word document for every list entry whose date equals ``news_data``.
#
# The title and link of each matching entry are copied out of the list page
# before any navigation happens. Element handles are tied to the page they
# were located on, so loading an article inside a loop over ``url_list`` can
# invalidate the remaining ``li`` handles, even after ``driver.back()``.

# Characters that commonly cannot appear in file names; each is replaced with
# "_" when the article title is used as the output file name.
unsafe_filename_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})

try:
    # Pass 1: gather (title, url) for the entries published on the target date.
    articles = []
    for li in url_list:
        data_time = li.find_element(By.XPATH, "i").text  # date label of the entry
        if data_time.replace("[", "").replace("]", "") != news_data:
            continue
        link = li.find_element(By.XPATH, "a")
        articles.append((link.text, link.get_attribute("href")))

    # Pass 2: open each article, scrape it and fill in the Word template.
    for title, url in articles:
        print("-----------------------------------------------获取新闻标题-----------------------------------------")
        print(title)
        print("-----------------------------------------------获取新闻连接-----------------------------------------")
        print(url)
        print("-----------------------------------------------进入文章详情页-----------------------------------------")
        driver.get(url)

        print("-----------------------------------------------开始获取-----------------------------------------")
        print("-----------------------------------------------开始获取时间-----------------------------------------")
        riqi = driver.find_element(By.XPATH, '//div[@class="text_c"]/p[@class="sou"]').text
        # The first 11 characters are taken as the date ("YYYY年MM月DD日")
        # and rewritten as "YYYY-MM-DD".
        shijian = riqi[0:11].replace("年", "-").replace("月", "-").replace("日", "")
        print(shijian)
        print("-----------------------------------------------获取时间结束-----------------------------------------")

        print("-----------------------------------------------开始获取作者-----------------------------------------")
        zuozhe = driver.find_element(By.XPATH, '//div[@class="text_c"]/p[@class="sou1"]').text
        print(zuozhe)
        print("-----------------------------------------------获取作者结束-----------------------------------------")

        print("-----------------------------------------------开始获取内容-----------------------------------------")
        content_list = driver.find_elements(By.XPATH, '//div[@class="show_text"]/p')
        # Indent every paragraph with two full-width spaces and separate the
        # paragraphs with newlines; no newline follows the last paragraph,
        # including when there is only one.
        text = "\n".join("　　" + content.text for content in content_list)

        print(text)
        print("-----------------------------------------------获取内容结束--------------------------------------")
        print("-----------------------------------------------获取结束-----------------------------------------")

        print("==============================开始docx生成(写入 标题、时间、来源、作者)==============================")
        document = Document("../模拟点击/新闻模板.docx")  # load the template document

        replace_word(document, "{{标题}}", title)
        replace_word(document, "{{时间}}", shijian)
        replace_word(document, "{{来源}}", "人民网")
        replace_word(document, "{{作者}}", zuozhe)
        replace_word(document, "{{内容}}", text)

        # The title is used as the file name, so characters that are not
        # valid in a file name are replaced first.
        safe_title = title.translate(unsafe_filename_chars)
        document.save("../模拟点击/5、中共党史/" + safe_title + ".docx")  # write the filled-in document
        print("==============================完成docx生成(写入 标题、时间、来源、作者)===============================")
finally:
    # End the browser session even when scraping fails part-way through.
    driver.quit()



